library(tidyverse)
library(data.table)
library(tigris)
library(eiCompare)
library(future) 
library(dplyr)
library(openxlsx)
library(flextable)
set_flextable_defaults(font.family = "EB Garamond")

########TABLE 1
#load data. 
#This is the aggregate file produced after combining name dictionaries with WA vote data
#each row should be a race group x election combination.
all_elections <- fread("/Users/michaelherndon/Library/CloudStorage/Dropbox/Signature Matching/Data and Analysis/Kassra Analysis/PRQ Audit Replication/all_elections.csv")

#this is where I clean up the racial CATEGORIES
#currently set to compare anyone with a hispanic surname against
#anyone with a white surname
all_elections <- all_elections %>%
  mutate(
    Category = case_when(
      #spanish_bothname == 1 ~ "Spanish Both Names",
      spanish_surname == 1 | spanish_bothname == 1 ~ "Spanish Surname",
      spanish_firstname == 1 & spanish_bothname == 0  ~ "Spanish First Name",
      #white_bothname == 1 ~ "White Both Names",
      white_surname == 1 | white_bothname == 1 ~ "White Surname",
      white_firstname == 1 & white_bothname == 0  ~ "White First Name",
      asian_bothname == 1 ~ "Asian Both Names",
      asian_surname == 1 & asian_bothname == 0 ~ "Asian Surname",
      asian_firstname == 1 & asian_bothname == 0 ~ "Asian First Name",
      black_bothname == 1 ~ "Black Both Names", 
      black_surname == 1 & black_bothname == 0 ~ "Black Surname",
      black_firstname == 1 & black_bothname == 0 ~ "Black First Name"
    ))


#de
general_elections <- c(
  "General Nov  5 2019",
  "General Nov  3 2020",
  "General Nov  2 2021",
  "General Nov  8 2022",
  "General Nov  7 2023",
  "General Nov  5 2024"
)


central_wa <- c("Benton", "Chelan", "Yakima", "Okanogan", "Douglas", "Grant", "Adams", "Franklin", "Walla Walla")

#GROUPING BY COUNTY
all_elections_summary <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  filter(County %in% c(central_wa)) %>% #this is where I select certain counties
  group_by(County, Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`)
  ) %>%
  ungroup() # Ungroup to make it a single data frame with one row per election

# Create a row that combines all central WA counties
combined_row <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`),
    County = "Central WA"
  )

#create a row that is statewide
# Create a row that combines all central WA counties
combined_row_statewide <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  #filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`),
    County = "All Counties"
  )

# Bind the combined row to the summary table
all_elections_summary <- bind_rows(all_elections_summary, combined_row, combined_row_statewide)

# Function to run a two-proportion z-test and return p-value and z-score
two_prop_z_test <- function(delta1, delta2, n1, n2) {
  # Calculate the standard error for the difference in proportions
  se_diff <- sqrt((delta1 * (1 - delta1) / n1) + (delta2 * (1 - delta2) / n2))
  # Calculate the z-score
  z_score <- (delta1 - delta2) / se_diff
  # Calculate the p-value (two-tailed test)
  p_value <- 2 * (1 - pnorm(abs(z_score)))
  return(c(p_value, z_score))
}

# Apply the z-test for each row
county_summary <- all_elections_summary %>%
  rowwise() %>%
  mutate(
    # Test for difference between the deltas (White vs. Hispanic)
    test_results = list(two_prop_z_test(
      `delta_white`, `delta_hispanic`, 
      `total_ballots_White Surname`, `total_ballots_Spanish Surname`
    )),
    # Extract p-value and z-score from the result list
    prop_test_p_value = test_results[[1]],
    z_score = test_results[[2]]
  ) %>%
  ungroup() %>%
  filter(`total_sig_rejected_White Surname` > 1 & `total_sig_rejected_Spanish Surname`> 1 & 
           `total_late_rejected_White Surname` > 1 & 
           `total_late_rejected_Spanish Surname` > 1) %>%
  select(
    #Election,
    County,
    `White Total Ballots` = `total_ballots_White Surname`,
    `Hispanic Total Ballots` = `total_ballots_Spanish Surname`,
    `White Late Rejection Rate` = `late_rate_White Surname`,
    `White Signature Rejection Rate` = `sig_rate_White Surname` ,
    `White Delta` = delta_white,
    `Hispanic Late Rejection Rate` = `late_rate_Spanish Surname`,
    `Hispanic Signature Rejection Rate` = `sig_rate_Spanish Surname`,
    `Hispanic Delta` = delta_hispanic, 
    `White:Hispanic Late Rejection Ratio` = late_rejection_ratio,
    `White:Hispanic Signature Rejection Ratio` = sig_rejection_ratio,
    #`Z Score` = z_score,
    `p-value (Two-Delta Z-test)` = prop_test_p_value
    
  )

# Round numeric columns 
county_summary[] <- lapply(county_summary, function(x) {
  if (is.numeric(x)) round(x, 3) else x
})

#order counties and make table
ft <- county_summary %>%
  mutate(County = factor(
   County,
   levels = c(
   "All Counties", "Central WA","Adams", "Benton", "Chelan", "Douglas", "Franklin", "Grant", "Okanogan", "Walla Walla", "Yakima"))) %>%

  arrange(County) %>%
  flextable() %>%
  fontsize(size = 8, part = "all") %>%
  font(fontname = "EB Garamond", part = "all")  %>%
  mk_par(j = "White Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "White Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Delta", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Delta`, formatter = fmt_pct))) %>%
  mk_par(j = "White Delta", part = "body",
         value = as_paragraph(as_chunk(`White Delta`, formatter = fmt_pct)))

ft
#Table 2 FIRST AND LAST NAMES
#####################
#####################
#####################
#####################
#####################
#####################
#load data. 
#This is the aggregate file produced after combining name dictionaries with WA vote data
#each row should be a race group x election combination.
all_elections <- fread("/Users/michaelherndon/Library/CloudStorage/Dropbox/Signature Matching/Data and Analysis/Kassra Analysis/PRQ Audit Replication/all_elections.csv")

#this is where I clean up the racial CATEGORIES
#currently set to compare anyone with a hispanic BOTH NAMES against
#anyone with a white BOTH NAMES
all_elections <- all_elections %>%
  mutate(
    Category = case_when(
      spanish_bothname == 1 ~ "Spanish Both Names",
      #spanish_surname == 1 | spanish_bothname == 1 ~ "Spanish Surname",
      spanish_firstname == 1 & spanish_bothname == 0  ~ "Spanish First Name",
      white_bothname == 1 ~ "White Both Names",
      #white_surname == 1 | white_bothname == 1 ~ "White Surname",
      white_firstname == 1 & white_bothname == 0  ~ "White First Name",
      asian_bothname == 1 ~ "Asian Both Names",
      asian_surname == 1 & asian_bothname == 0 ~ "Asian Surname",
      asian_firstname == 1 & asian_bothname == 0 ~ "Asian First Name",
      black_bothname == 1 ~ "Black Both Names", 
      black_surname == 1 & black_bothname == 0 ~ "Black Surname",
      black_firstname == 1 & black_bothname == 0 ~ "Black First Name"
    ))

#GROUPING BY COUNTY
all_elections_summary <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Both Names", "White Both Names")) %>%
  filter(County %in% c(central_wa)) %>% #this is where I select certain counties
  group_by(County, Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Both Names` / `late_rate_White Both Names`,
    sig_rejection_ratio = `sig_rate_Spanish Both Names` / `sig_rate_White Both Names`,
    delta_hispanic = abs(`sig_rate_Spanish Both Names`-`late_rate_Spanish Both Names`),
    delta_white = abs(`sig_rate_White Both Names` -`late_rate_White Both Names`)
  ) %>%
  ungroup() # Ungroup to make it a single data frame with one row per election

# Create a row that combines all central WA counties
combined_row <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Both Names", "White Both Names")) %>%
  filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Both Names` / `late_rate_White Both Names`,
    sig_rejection_ratio = `sig_rate_Spanish Both Names` / `sig_rate_White Both Names`,
    delta_hispanic = abs(`sig_rate_Spanish Both Names`-`late_rate_Spanish Both Names`),
    delta_white = abs(`sig_rate_White Both Names` -`late_rate_White Both Names`),
    County = "Central WA"
  )

#create a row that is statewide
# Create a row that combines all central WA counties
combined_row_statewide <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Both Names", "White Both Names")) %>%
  #filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Both Names` / `late_rate_White Both Names`,
    sig_rejection_ratio = `sig_rate_Spanish Both Names` / `sig_rate_White Both Names`,
    delta_hispanic = abs(`sig_rate_Spanish Both Names`-`late_rate_Spanish Both Names`),
    delta_white = abs(`sig_rate_White Both Names` -`late_rate_White Both Names`),
    County = "All Counties"
  )

# Bind the combined row to the summary table
all_elections_summary <- bind_rows(all_elections_summary, combined_row, combined_row_statewide)

# Function to run a two-proportion z-test and return p-value and z-score
two_prop_z_test <- function(delta1, delta2, n1, n2) {
  # Calculate the standard error for the difference in proportions
  se_diff <- sqrt((delta1 * (1 - delta1) / n1) + (delta2 * (1 - delta2) / n2))
  # Calculate the z-score
  z_score <- (delta1 - delta2) / se_diff
  # Calculate the p-value (two-tailed test)
  p_value <- 2 * (1 - pnorm(abs(z_score)))
  return(c(p_value, z_score))
}

# Apply the z-test for each row
county_summary <- all_elections_summary %>%
  rowwise() %>%
  mutate(
    # Test for difference between the deltas (White vs. Hispanic)
    test_results = list(two_prop_z_test(
      `delta_white`, `delta_hispanic`, 
      `total_ballots_White Both Names`, `total_ballots_Spanish Both Names`
    )),
    # Extract p-value and z-score from the result list
    prop_test_p_value = test_results[[1]],
    z_score = test_results[[2]]
  ) %>%
  ungroup() %>%
  filter(`total_sig_rejected_White Both Names` > 1 & `total_sig_rejected_Spanish Both Names`> 1 & 
           `total_late_rejected_White Both Names` > 1 & 
           `total_late_rejected_Spanish Both Names` > 1) %>%
  select(
    #Election,
    County,
    `White Total Ballots` = `total_ballots_White Both Names`,
    `Hispanic Total Ballots` = `total_ballots_Spanish Both Names`,
    `White Late Rejection Rate` = `late_rate_White Both Names`,
    `White Signature Rejection Rate` = `sig_rate_White Both Names` ,
    `White Delta` = delta_white,
    `Hispanic Late Rejection Rate` = `late_rate_Spanish Both Names`,
    `Hispanic Signature Rejection Rate` = `sig_rate_Spanish Both Names`,
    `Hispanic Delta` = delta_hispanic, 
    `White:Hispanic Late Rejection Ratio` = late_rejection_ratio,
    `White:Hispanic Signature Rejection Ratio` = sig_rejection_ratio,
    #`Z Score` = z_score,
    `p-value (Two-Delta Z-test)` = prop_test_p_value
    
  )

# Round numeric columns 
county_summary[] <- lapply(county_summary, function(x) {
  if (is.numeric(x)) round(x, 3) else x
})

#order counties and make table
ft <- county_summary %>%
  mutate(County = factor(
    County,
    levels = c(
      "All Counties", "Central WA","Adams", "Benton", "Chelan", "Douglas", "Franklin", "Grant", "Okanogan", "Walla Walla", "Yakima"))) %>%
  
  arrange(County) %>%
  flextable() %>%
  fontsize(size = 8, part = "all") %>%
  font(fontname = "EB Garamond", part = "all")  %>%
  mk_par(j = "White Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "White Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Delta", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Delta`, formatter = fmt_pct))) %>%
  mk_par(j = "White Delta", part = "body",
         value = as_paragraph(as_chunk(`White Delta`, formatter = fmt_pct)))

ft

#Table A1 Surnames and grouped by election
#####################
#####################
#####################
#####################
#####################
#####################
#GROUPING BY Election
#####################
#####################
#####################
#####################
#####################
#####################
all_elections <- all_elections %>%
  mutate(
    Category = case_when(
      #spanish_bothname == 1 ~ "Spanish Both Names",
      spanish_surname == 1 | spanish_bothname == 1 ~ "Spanish Surname",
      spanish_firstname == 1 & spanish_bothname == 0  ~ "Spanish First Name",
      #white_bothname == 1 ~ "White Both Names",
      white_surname == 1 | white_bothname == 1 ~ "White Surname",
      white_firstname == 1 & white_bothname == 0  ~ "White First Name",
      asian_bothname == 1 ~ "Asian Both Names",
      asian_surname == 1 & asian_bothname == 0 ~ "Asian Surname",
      asian_firstname == 1 & asian_bothname == 0 ~ "Asian First Name",
      black_bothname == 1 ~ "Black Both Names", 
      black_surname == 1 & black_bothname == 0 ~ "Black Surname",
      black_firstname == 1 & black_bothname == 0 ~ "Black First Name"
    ))

#GROUPING BY ELECTION
all_elections_summary <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  filter(County %in% c(central_wa)) %>% #this is where I select certain counties
  group_by(Election, Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`)
  ) %>%
  ungroup() # Ungroup to make it a single data frame with one row per election

# Create a row that combines all elections
combined_row <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`),
    Election = "All Elections"
  )


# Bind the combined row to the summary table
all_elections_summary <- bind_rows(all_elections_summary, combined_row)

# Function to run a two-proportion z-test and return p-value and z-score
two_prop_z_test <- function(delta1, delta2, n1, n2) {
  # Calculate the standard error for the difference in proportions
  se_diff <- sqrt((delta1 * (1 - delta1) / n1) + (delta2 * (1 - delta2) / n2))
  # Calculate the z-score
  z_score <- (delta1 - delta2) / se_diff
  # Calculate the p-value (two-tailed test)
  p_value <- 2 * (1 - pnorm(abs(z_score)))
  return(c(p_value, z_score))
}

# Apply the z-test for each row
county_summary <- all_elections_summary %>%
  rowwise() %>%
  mutate(
    # Test for difference between the deltas (White vs. Hispanic)
    test_results = list(two_prop_z_test(
      `delta_white`, `delta_hispanic`, 
      `total_ballots_White Surname`, `total_ballots_Spanish Surname`
    )),
    # Extract p-value and z-score from the result list
    prop_test_p_value = test_results[[1]],
    z_score = test_results[[2]]
  ) %>%
  ungroup() %>%
  filter(`total_sig_rejected_White Surname` > 1 & `total_sig_rejected_Spanish Surname`> 1 & 
           `total_late_rejected_White Surname` > 1 & 
           `total_late_rejected_Spanish Surname` > 1) %>%
  select(
    Election,
    #County,
    `White Total Ballots` = `total_ballots_White Surname`,
    `Hispanic Total Ballots` = `total_ballots_Spanish Surname`,
    `White Late Rejection Rate` = `late_rate_White Surname`,
    `White Signature Rejection Rate` = `sig_rate_White Surname` ,
    `White Delta` = delta_white,
    `Hispanic Late Rejection Rate` = `late_rate_Spanish Surname`,
    `Hispanic Signature Rejection Rate` = `sig_rate_Spanish Surname`,
    `Hispanic Delta` = delta_hispanic, 
    `White:Hispanic Late Rejection Ratio` = late_rejection_ratio,
    `White:Hispanic Signature Rejection Ratio` = sig_rejection_ratio,
    #`Z Score` = z_score,
    `p-value (Two-Delta Z-test)` = prop_test_p_value
    
  )

# Round numeric columns 
county_summary[] <- lapply(county_summary, function(x) {
  if (is.numeric(x)) round(x, 3) else x
})

#order counties and make table
ft <- county_summary %>%
  mutate(Election = factor(
    Election,
    levels = c(
      "All Elections", "General Nov  5 2019",
      "General Nov  3 2020",
      "General Nov  2 2021",
      "General Nov  8 2022",
      "General Nov  7 2023",
      "General Nov  5 2024"))) %>%
  
  arrange(Election) %>%
  flextable() %>%
  fontsize(size = 8, part = "all") %>%
  font(fontname = "EB Garamond", part = "all")  %>%
  mk_par(j = "White Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "White Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Delta", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Delta`, formatter = fmt_pct))) %>%
  mk_par(j = "White Delta", part = "body",
         value = as_paragraph(as_chunk(`White Delta`, formatter = fmt_pct)))

ft
#Table A2 both names and grouped by election
#####################
#####################
#####################
#####################
#####################
#####################
#GROUPING BY Election
#####################
#####################
#####################
#####################
#####################
#####################
all_elections <- all_elections %>%
  mutate(
    Category = case_when(
      spanish_bothname == 1 ~ "Spanish Both Names",
      #spanish_surname == 1 | spanish_bothname == 1 ~ "Spanish Surname",
      spanish_firstname == 1 & spanish_bothname == 0  ~ "Spanish First Name",
      white_bothname == 1 ~ "White Both Names",
      #white_surname == 1 | white_bothname == 1 ~ "White Surname",
      white_firstname == 1 & white_bothname == 0  ~ "White First Name",
      asian_bothname == 1 ~ "Asian Both Names",
      asian_surname == 1 & asian_bothname == 0 ~ "Asian Surname",
      asian_firstname == 1 & asian_bothname == 0 ~ "Asian First Name",
      black_bothname == 1 ~ "Black Both Names", 
      black_surname == 1 & black_bothname == 0 ~ "Black Surname",
      black_firstname == 1 & black_bothname == 0 ~ "Black First Name"
    ))

#GROUPING BY ELECTION
all_elections_summary <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Both Names", "White Both Names")) %>%
  filter(County %in% c(central_wa)) %>% #this is where I select certain counties
  group_by(Election, Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Both Names` / `late_rate_White Both Names`,
    sig_rejection_ratio = `sig_rate_Spanish Both Names` / `sig_rate_White Both Names`,
    delta_hispanic = abs(`sig_rate_Spanish Both Names`-`late_rate_Spanish Both Names`),
    delta_white = abs(`sig_rate_White Both Names` -`late_rate_White Both Names`)
  ) %>%
  ungroup() # Ungroup to make it a single data frame with one row per election

# Create a row that combines all elections
combined_row <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Both Names", "White Both Names")) %>%
  filter(County %in% c(central_wa)) %>%
  group_by(Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Both Names` / `late_rate_White Both Names`,
    sig_rejection_ratio = `sig_rate_Spanish Both Names` / `sig_rate_White Both Names`,
    delta_hispanic = abs(`sig_rate_Spanish Both Names`-`late_rate_Spanish Both Names`),
    delta_white = abs(`sig_rate_White Both Names` -`late_rate_White Both Names`),
    Election = "All Elections"
  )


# Bind the combined row to the summary table
all_elections_summary <- bind_rows(all_elections_summary, combined_row)

# Function to run a two-proportion z-test and return p-value and z-score
two_prop_z_test <- function(delta1, delta2, n1, n2) {
  # Calculate the standard error for the difference in proportions
  se_diff <- sqrt((delta1 * (1 - delta1) / n1) + (delta2 * (1 - delta2) / n2))
  # Calculate the z-score
  z_score <- (delta1 - delta2) / se_diff
  # Calculate the p-value (two-tailed test)
  p_value <- 2 * (1 - pnorm(abs(z_score)))
  return(c(p_value, z_score))
}

# Apply the z-test for each row
county_summary <- all_elections_summary %>%
  rowwise() %>%
  mutate(
    # Test for difference between the deltas (White vs. Hispanic)
    test_results = list(two_prop_z_test(
      `delta_white`, `delta_hispanic`, 
      `total_ballots_White Both Names`, `total_ballots_Spanish Both Names`
    )),
    # Extract p-value and z-score from the result list
    prop_test_p_value = test_results[[1]],
    z_score = test_results[[2]]
  ) %>%
  ungroup() %>%
  filter(`total_sig_rejected_White Both Names` > 1 & `total_sig_rejected_Spanish Both Names`> 1 & 
           `total_late_rejected_White Both Names` > 1 & 
           `total_late_rejected_Spanish Both Names` > 1) %>%
  select(
    Election,
    #County,
    `White Total Ballots` = `total_ballots_White Both Names`,
    `Hispanic Total Ballots` = `total_ballots_Spanish Both Names`,
    `White Late Rejection Rate` = `late_rate_White Both Names`,
    `White Signature Rejection Rate` = `sig_rate_White Both Names` ,
    `White Delta` = delta_white,
    `Hispanic Late Rejection Rate` = `late_rate_Spanish Both Names`,
    `Hispanic Signature Rejection Rate` = `sig_rate_Spanish Both Names`,
    `Hispanic Delta` = delta_hispanic, 
    `White:Hispanic Late Rejection Ratio` = late_rejection_ratio,
    `White:Hispanic Signature Rejection Ratio` = sig_rejection_ratio,
    #`Z Score` = z_score,
    `p-value (Two-Delta Z-test)` = prop_test_p_value
    
  )

# Round numeric columns 
county_summary[] <- lapply(county_summary, function(x) {
  if (is.numeric(x)) round(x, 3) else x
})

#order counties and make table
ft <- county_summary %>%
  mutate(Election = factor(
    Election,
    levels = c(
      "All Elections", "General Nov  5 2019",
      "General Nov  3 2020",
      "General Nov  2 2021",
      "General Nov  8 2022",
      "General Nov  7 2023",
      "General Nov  5 2024"))) %>%
  
  arrange(Election) %>%
  flextable() %>%
  fontsize(size = 8, part = "all") %>%
  font(fontname = "EB Garamond", part = "all")  %>%
  mk_par(j = "White Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Signature Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Signature Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "White Late Rejection Rate", part = "body",
         value = as_paragraph(as_chunk(`White Late Rejection Rate`, formatter = fmt_pct))) %>%
  mk_par(j = "Hispanic Delta", part = "body",
         value = as_paragraph(as_chunk(`Hispanic Delta`, formatter = fmt_pct))) %>%
  mk_par(j = "White Delta", part = "body",
         value = as_paragraph(as_chunk(`White Delta`, formatter = fmt_pct)))

ft

#FIGURES
################
################
################
################
################
################
################
################
#Line CHART
#GROUPING BY ELECTION
all_elections_summary <- all_elections %>%
  filter(Election %in% general_elections,
         Category %in% c("Spanish Surname", "White Surname")) %>%
  filter(County %in% c(central_wa)) %>% #this is where I select certain counties
  group_by(Election, Category) %>%
  summarise(
    total_late_rejected = sum(too_late_rej, na.rm = TRUE),
    total_sig_rejected = sum(sig_mismatch_rej, na.rm = TRUE),
    total_ballots = sum(total_votes, na.rm = TRUE),
    late_rate = total_late_rejected / total_ballots,
    sig_rate = total_sig_rejected / total_ballots
  ) %>%
  pivot_wider(
    names_from = Category,
    values_from = c(total_late_rejected, total_sig_rejected, total_ballots, late_rate, sig_rate),
    names_glue = "{.value}_{Category}"
  ) %>%
  mutate(
    late_rejection_ratio = `late_rate_Spanish Surname` / `late_rate_White Surname`,
    sig_rejection_ratio = `sig_rate_Spanish Surname` / `sig_rate_White Surname`,
    delta_hispanic = abs(`sig_rate_Spanish Surname`-`late_rate_Spanish Surname`),
    delta_white = abs(`sig_rate_White Surname` -`late_rate_White Surname`)
  ) %>%
  ungroup() # Ungroup to make it a single data frame with one row per election




all_elections_summary <- all_elections_summary %>%
  mutate(
    se_late_rate_spanish = sqrt((`late_rate_Spanish Surname` * (1 - `late_rate_Spanish Surname`)) / `total_ballots_Spanish Surname`),
    se_late_rate_white   = sqrt((`late_rate_White Surname` * (1 - `late_rate_White Surname`)) / `total_ballots_White Surname`),
    se_sig_rate_spanish = sqrt((`sig_rate_Spanish Surname` * (1 - `sig_rate_Spanish Surname`)) / `total_ballots_Spanish Surname`),
    se_sig_rate_white   = sqrt((`sig_rate_White Surname` * (1 - `sig_rate_White Surname`)) / `total_ballots_White Surname`)
  ) %>%
  mutate(Election = factor(
    Election,
    levels = c(
      "All Elections", "General Nov  5 2019",
      "General Nov  3 2020",
      "General Nov  2 2021",
      "General Nov  8 2022",
      "General Nov  7 2023",
      "General Nov  5 2024")))


long_plot_df <- all_elections_summary %>%
  select(Election,
         `sig_rate_Spanish Surname`, `sig_rate_White Surname`,
         se_sig_rate_spanish, se_sig_rate_white) %>%
  #`sig_rate_Spanish Surname`, `sig_rate_White Surname`,
  #se_sig_rate_spanish, se_sig_rate_white) %>%
  pivot_longer(cols = c(`sig_rate_Spanish Surname`, `sig_rate_White Surname`),
               names_to = "Category",
               values_to = "sig_rate") %>%
  mutate(
    se = case_when(
      Category == "sig_rate_Spanish Surname" ~ se_sig_rate_spanish,
      Category == "sig_rate_White Surname" ~ se_sig_rate_white
    ),
    Category = recode(Category,
                      `sig_rate_Spanish Surname` = "Hispanic Surname",
                      `sig_rate_White Surname` = "White Surname")
  )


ggplot(long_plot_df %>% filter(Election != "All Elections"), 
       aes(x = Election, y = sig_rate, color = Category, linetype = Category, group = Category)) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  geom_errorbar(aes(ymin = sig_rate - 1.96 * se, ymax = sig_rate + 1.96 * se), width = 0.2) +
  labs(
    #title = "Sig Rejection Rates by Election",
    x = "Election",
    y = "Rejection Rate",
    color = "Category",
    linetype = "Category"
  ) +
  scale_color_manual(values = c("White Surname" = "red", "Hispanic Surname" = "blue")) +
  scale_linetype_manual(values = c("White Surname" = "solid", "Hispanic Surname" = "dashed")) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 100),
    limits = c(0, 0.02),
    breaks = seq(0, 0.02, by = 0.001)
  ) +
  scale_x_discrete(labels = c(
    "General Nov 5 2019" = "2019G Nov 5",
    "General Nov 3 2020" = "2020G Nov 3",
    "General Nov 2 2021" = "2021G Nov 2",
    "General Nov 8 2022" = "2022G Nov 8",
    "General Nov 7 2023" = "2023G Nov 7",
    "General Nov 5 2024" = "2024G Nov 5"
  )) +
  theme_minimal(base_family = "EB Garamond") +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 22),
    axis.text.y = element_text(size = 22),
    axis.title.x = element_text(size = 24),
    axis.title.y = element_text(size = 24),
    plot.title = element_text(size = 26, face = "bold"),
    legend.text = element_text(size = 22),
    legend.title = element_blank()
  ) + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels

